library(janitor)
package ‘janitor’ was built under R version 3.6.2replacing previous import ‘vctrs::data_frame’ by ‘tibble::data_frame’ when loading ‘dplyr’
Attaching package: ‘janitor’
The following objects are masked from ‘package:stats’:
chisq.test, fisher.test
library(fastDummies)
library(broom)
package ‘broom’ was built under R version 3.6.2
library(rpart)
library(rpart.plot)
library(tidyverse)
Registered S3 methods overwritten by 'dbplyr':
method from
print.tbl_lazy
print.tbl_sql
[37m── [1mAttaching packages[22m ────────────────────────────────────────────── tidyverse 1.3.0 ──[39m
[37m[32m✓[37m [34mggplot2[37m 3.3.2 [32m✓[37m [34mpurrr [37m 0.3.4
[32m✓[37m [34mtibble [37m 3.0.3 [32m✓[37m [34mdplyr [37m 1.0.0
[32m✓[37m [34mtidyr [37m 1.1.0 [32m✓[37m [34mstringr[37m 1.4.0
[32m✓[37m [34mreadr [37m 1.3.1 [32m✓[37m [34mforcats[37m 0.5.0[39m
package ‘ggplot2’ was built under R version 3.6.2package ‘tibble’ was built under R version 3.6.2package ‘tidyr’ was built under R version 3.6.2package ‘purrr’ was built under R version 3.6.2package ‘dplyr’ was built under R version 3.6.2[37m── [1mConflicts[22m ───────────────────────────────────────────────── tidyverse_conflicts() ──
[31mx[37m [34mdplyr[37m::[32mfilter()[37m masks [34mstats[37m::filter()
[31mx[37m [34mdplyr[37m::[32mlag()[37m masks [34mstats[37m::lag()[39m
library(factoextra)
package ‘factoextra’ was built under R version 3.6.2Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
customer_data <- read_csv("data/mall_customers.csv")
Parsed with column specification:
cols(
CustomerID = [32mcol_double()[39m,
Gender = [31mcol_character()[39m,
Age = [32mcol_double()[39m,
`Annual Income (k$)` = [32mcol_double()[39m,
`Spending Score (1-100)` = [32mcol_double()[39m
)
customer_data <- customer_data %>%
clean_names()
summary(customer_data)
customer_id gender age annual_income_k
Min. : 1.00 Length:200 Min. :18.00 Min. : 15.00
1st Qu.: 50.75 Class :character 1st Qu.:28.75 1st Qu.: 41.50
Median :100.50 Mode :character Median :36.00 Median : 61.50
Mean :100.50 Mean :38.85 Mean : 60.56
3rd Qu.:150.25 3rd Qu.:49.00 3rd Qu.: 78.00
Max. :200.00 Max. :70.00 Max. :137.00
spending_score_1_100
Min. : 1.00
1st Qu.:34.75
Median :50.00
Mean :50.20
3rd Qu.:73.00
Max. :99.00
We are interested in creating a marketing campaign to target customers based on their spending score and annual income. Perform a k-means clustering to find if there are meaningful clusters in the data to target the customers.
ggplot(customer_data, aes(x = spending_score_1_100, y = annual_income_k)) +
geom_point()
Perform k-means clustering and chose a value of k.
customer <- customer_data %>%
select(spending_score_1_100, annual_income_k)
customer
summary(customer)
spending_score_1_100 annual_income_k
Min. : 1.00 Min. : 15.00
1st Qu.:34.75 1st Qu.: 41.50
Median :50.00 Median : 61.50
Mean :50.20 Mean : 60.56
3rd Qu.:73.00 3rd Qu.: 78.00
Max. :99.00 Max. :137.00
customer_scale <- customer %>%
mutate_all(scale)
customer_scale
clustered_customers <- kmeans(customer_scale, centers = 6, nstart = 25)
clustered_customers
K-means clustering with 6 clusters of sizes 23, 81, 22, 10, 29, 35
Cluster means:
spending_score_1_100 annual_income_k
1 -1.13411939 -1.3042458
2 -0.02638995 -0.2004097
3 1.12934389 -1.3262173
4 1.23143545 1.8709508
5 1.23811207 0.6850149
6 -1.28122394 1.0523622
Clustering vector:
[1] 1 3 1 3 1 3 1 3 1 3 1 3 1 3 1 3 1 3 1 3 1 3 1 3 1 3 1 3 1 3 1 3 1 3 1 3 1 3 1 3 1
[42] 3 1 2 1 3 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
[83] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
[124] 5 6 5 2 5 6 5 6 5 2 5 6 5 6 5 6 5 6 5 2 5 6 5 6 5 6 5 6 5 6 5 6 5 6 5 6 5 6 5 6 5
[165] 6 5 6 5 6 5 6 5 6 5 6 5 6 5 6 5 6 4 6 4 6 4 6 4 6 4 6 4 6 4 6 4 6 4 6 4
Within cluster sum of squares by cluster:
[1] 7.577407 14.485632 5.217630 3.681858 5.514889 18.304646
(between_SS / total_SS = 86.2 %)
Available components:
[1] "cluster" "centers" "totss" "withinss" "tot.withinss"
[6] "betweenss" "size" "iter" "ifault"
library(broom)
tidy(clustered_customers)
NA
glance(clustered_customers)
augment(clustered_customers, customer)
library(animation)
customer_scale %>%
kmeans.ani(centers = 6)
Visualise the clustering for your chosen value of k.
# Set min & max number of clusters want to look at
max_k <- 20
k_clusters <- tibble(k = 1:max_k) %>%
mutate(
kclust = map(k, ~ kmeans(customer_scale, .x, nstart = 25)),
tidied = map(kclust, tidy),
glanced = map(kclust, glance),
augmented = map(kclust, augment, customer)
)
k_clusters
NA
clusterings <- k_clusters %>%
unnest(glanced)
clusterings
ggplot(clusterings, aes(x=k, y=tot.withinss)) +
geom_point() +
geom_line() +
scale_x_continuous(breaks = seq(1, 20, by = 1))
fviz_nbclust(customer_scale, kmeans, method = "wss", nstart = 25)
Registered S3 method overwritten by 'data.table':
method from
print.data.table
fviz_nbclust(customer_scale, kmeans, method = "silhouette", nstart = 25)
fviz_nbclust(customer_scale, kmeans, method = "gap_stat", nstart = 25, k.max = 10)
Clustering k = 1,2,..., K.max (= 10): .. done
Bootstrapping, b = 1,2,..., B (= 100) [one "." per sample]:
.................................................. 50
.................................................. 100
clusterings %>%
unnest(cols = c(augmented)) %>%
filter(k <= 5) %>%
ggplot(aes(x = spending_score_1_100, y = annual_income_k)) +
geom_point(aes(color = .cluster)) +
facet_wrap(~ k)
clusterings %>%
unnest(cols = c(augmented)) %>%
filter(k == 5) %>%
ggplot(aes(x = spending_score_1_100, y = annual_income_k, colour = .cluster)) +
geom_point(aes(color = .cluster))
clusterings %>%
unnest(augmented) %>%
filter(k == 5) %>%
group_by(.cluster) %>%
summarise(mean(spending_score_1_100), mean(annual_income_k))
`summarise()` ungrouping output (override with `.groups` argument)
Do you think the clustering seems a good fit for this data?
Comment on the attributes on one or two of the clusters (maybe even give them a label if you like - like in section 4.1 of the ‘Segmentation & clustering intro’ lesson).
CODECLAN SOLUTION
library(tidyverse)
library(janitor)
customer_data %>%
group_by(gender) %>%
summarise(count = n()) %>%
ggplot(aes(x = gender, y = count)) +
geom_col()
`summarise()` ungrouping output (override with `.groups` argument)
ggplot(customer, aes(x = annual_income_k, y = spending_score_1_100)) +
geom_point()
ggplot(customer, aes(x = annual_income_k)) +
geom_histogram()
ggplot(customer, aes(x = spending_score_1_100)) +
geom_histogram()
ggplot(customer_data, aes(x = age)) +
geom_histogram(bins = 50)
customers_scale_1 <- customer %>%
select(annual_income_k, spending_score_1_100) %>%
mutate_all(scale)
summary(customers_scale_1)
annual_income_k.V1 spending_score_1_100.V1
Min. :-1.7346462 Min. :-1.9052398
1st Qu.:-0.7256883 1st Qu.:-0.5982918
Median : 0.0357895 Median :-0.0077449
Mean : 0.0000000 Mean : 0.0000000
3rd Qu.: 0.6640086 3rd Qu.: 0.8829160
Max. : 2.9103678 Max. : 1.8897500
ggplot(customers_scale_1, aes(x = annual_income_k, y = spending_score_1_100)) +
geom_point()
customer_clusters_k5 <- k_clusters %>%
unnest(cols = c(augmented)) %>%
filter(k == 5)